For the evaluation of the different blood collection tubes, blood was drawn from 9 healthy volunteers (3 volunteers for the preservation tubes, 3 volunteers for the non-preservation plasma tubes and 3 volunteers for the non-preservation serum tubes). The order of all blood tubes was randomized per donor. All blood tubes were filled to the volume recommended by the manufacturer. We refer to the manuscript and supplemental files for more information about experimental setup.
Preservation tubes. We included 5 different preservation tubes (Streck cell-free RNA BCT, Streck cell-free DNA BCT, PAXgene Blood ccfDNA Tube, Roche cell-free DNA Collection tube and Biomatrica LBgard blood tube) with processing at 3 different timepoints after blood collection (immediately (T0), after 24 hours (T24) and after 72 hours (T72)). Thus, 15 blood tubes were drawn per healthy volunteer.
Non-preservation plasma tubes. We included 4 different non-preservation plasma tubes (BD Vacutainer K2E EDTA spray, Vacuette EDTA gel, BD Vacutainer ACD-A and Vacuette Sodium citrate 3.2%) with processing at 3 different timepoints after blood collection (immediately (T0), after 4 hours (T4) and after 16 hours (T16)). Thus, 12 blood tubes were drawn per healthy volunteer.
Non-preservation serum tube. We included 1 non-preservation serum tube (BD Vacutainer SST II Advance) with processing at 3 different timepoints after blood collection (immediately (T0), after 4 hours (T4) and after 16 hours (T16)). Thus, 3 blood tubes were drawn per healthy volunteer. All tubes were collected from one antecubital vein with the BD Vacutainer push button 21G with 12” tube butterfly needle system.
When 15 tubes were collected, 7 tubes were collected from one antecubital vein and 8 tubes were collected from the contralateral antecubital vein with the BD Vacutainer push button 21G, 7” tube with pre-attached holder tube butterfly needle system. When 12 tubes were collected, 6 tubes were collected from one antecubital vein and 6 tubes were collected from the contralateral antecubital vein with the BD Vacutainer push button 21G with 12” tube butterfly needle system.
Five metrics were evaluated. In order to select the tubes which remains most stable across time points, we calculated for every tube and donor the fold change across different timepoints (excluding T24-T72 or T04-T16). Thus, six fold-change values were obtained per tube. These were visualized and tubes were ordered based on the mean of these six values.
An overview of the mean fold changes is presented at the end of this analysis (see Fold change summary of 5 performance metrics)
First, basic parameters are set up in this RMarkdown, such as loading dependencies, setting paths and setting up a uniform plot structure.
Most plots in this RMarkdown file are made interactive with ggplot. PDF versions of most plots are in this GitHub repository under ./plots/
# fold change function + make plots
generateFC <- function(input_df, variable, dfName, AC = FALSE){
fold_change_input_all <- data.frame()
for (duplicate_type in unique(sample_annotation$Tube)){
sample_duplicates<-sample_annotation%>% filter(Tube==duplicate_type)
if(nrow(sample_duplicates)>1){
#print(duplicate_type)
#double_positives_sample <-double_positives %>% dplyr::select(MIMATID,sample_duplicates$UniqueID)
input_df_sample <- left_join(sample_duplicates, input_df)
# only keep the replicates of one type
samples_comb <- combn(sample_duplicates$UniqueID,2) #compare 2 of the 3 samples at a time
for (n_col in 1:ncol(samples_comb)) {
#print(samples_comb[,n_col])
nr_runA <- gsub("^[A-Z]+","",sample_annotation[sample_annotation$UniqueID== samples_comb[1,n_col],]$TimeLapse)
nr_runB <- gsub("^[A-Z]+","",sample_annotation[sample_annotation$UniqueID== samples_comb[2,n_col],]$TimeLapse)
RepA <- sample_annotation[sample_annotation$UniqueID== samples_comb[1,n_col],]$BiologicalReplicate
RepB <- sample_annotation[sample_annotation$UniqueID== samples_comb[2,n_col],]$BiologicalReplicate
if ((RepA == RepB)){
varname <- paste0("T",nr_runA,"_",nr_runB) #make a name so you can backtrace which replicates are compared
#print(varname)
compareVar <- input_df %>% filter(UniqueID == paste(samples_comb[1,n_col]) | UniqueID == paste0(samples_comb[2,n_col]))
if (AC == TRUE){
plot = "absolute change"
intercept = 0
if (is.na((abs(compareVar[[variable]][1]) > abs(compareVar[[variable]][2])))){
correlation_samples<- compareVar %>%
mutate(foldchange=NA)
}
else if (abs(compareVar[[variable]][1]) >= abs(compareVar[[variable]][2])){
correlation_samples<- compareVar %>%
mutate(foldchange= abs(compareVar[[variable]][1])-abs(compareVar[[variable]][2]))
} else {
correlation_samples<- compareVar %>%
mutate(foldchange= abs(compareVar[[variable]][2])-abs(compareVar[[variable]][1]))
}}
else if (AC == FALSE){
plot = "fold change"
intercept = 1
if (is.na((abs(compareVar[[variable]][1]) > abs(compareVar[[variable]][2])))){
correlation_samples<- compareVar %>%
mutate(foldchange=NA)
}
else if (abs(compareVar[[variable]][1]) >= abs(compareVar[[variable]][2])){
correlation_samples<- compareVar %>%
mutate(foldchange= abs(compareVar[[variable]][1])/abs(compareVar[[variable]][2]))
} else {
correlation_samples<- compareVar %>%
mutate(foldchange= abs(compareVar[[variable]][2])/abs(compareVar[[variable]][1]))
}}
FC_input<-correlation_samples %>%
mutate(Tube=duplicate_type, Replicates=varname, BiologicalReplicate = paste0(RepA,"-",RepB)) %>%
mutate(ReplID = paste0(Tube,"-",Replicates))
FC_input <- FC_input %>% mutate(inputType = paste0(dfName))
if (nrow(FC_input) == 2){
fold_change_input_all <- rbind(fold_change_input_all, as.data.frame(FC_input))
}
}
}
}
}
FC <- fold_change_input_all %>% dplyr::group_by(Tube,Replicates,BiologicalReplicate) %>%
mutate(ReplID = paste0(Tube,"-",Replicates)) %>% select(c(ReplID, foldchange, Replicates, inputType)) %>% distinct(ReplID, .keep_all = TRUE)
FC$TimePoint <- ifelse(FC$Replicates %in% c("T24_0", "T0_24", "T0_04", "T04_0"), "1st timepoint", ifelse(FC$Replicates %in% c("T72_0", "T0_72", "T0_16", "T16_0"), "2nd timepoint", NA))
FC$TubeType <- ifelse(FC$Tube %in% c("DNA Streck", "Biomatrica", "RNA Streck", "PAXgene", "Roche"), "preservation", "non-preservation")
print(ggplot(FC %>% filter(!is.na(TimePoint)), aes(x=reorder(Tube,foldchange, FUN = function(x) -mean(x, na.rm = TRUE)), y = foldchange)) +
geom_boxplot() +
geom_beeswarm(groupOnX=TRUE, size = 2.5, aes(col = TimePoint, shape = TubeType)) +
geom_hline(yintercept = intercept, linetype = "dashed", color = "gray") +
labs(subtitle = "ordered by mean (white triangle)", title = paste0(plot, " of ", dfName), y = paste0(plot), x = NULL, col = "comparison", shape = "tube type") +
scale_fill_manual(values=color_panel) +
theme_point +theme(axis.text.x = element_text(angle=90, vjust=0.5, hjust=1)) +
stat_summary(
geom = "point",
fun.y = "mean",
col = "black",
size = 2,
shape = 24,
fill = "white"
)
)
return(FC)
}Counts obtains were preprocessed and merged into seperate count matrices (see makesFiles.R and Pipeline parameters and MultiQC reports).
miRNAs <- data.table::fread(paste0(data_path, "allmiRs_subs.txt"), header=T, data.table = F)
spikes <- data.table::fread(paste0(data_path, "allspikes_subs.txt"), header=T, data.table = F)
reads <- data.table::fread(paste0(data_path, "allreads_subs.txt"), header=T, data.table = F)
contam <- data.table::fread(paste0(data_path, "allcontam_subs.txt"), header=T, data.table = F)Hemolysis was measured with Nanodrop (absorbance at 414 nm) in plasma across all tubes. Overall, we can see that there is less hemolysis in non-preservation tubes and there is more hemolysis in preservation tubes and at later timepoints. Hemolysis is also our first metric. Furthermore, Spearman rank correlation between replicate platelet measurements is high (R = 0.82).
p1 <- ggplot(sample_annotation,aes(x=TimeLapse,y=Hemolysis_PFP, col=BiologicalReplicate, group = BiologicalReplicate))+
geom_point()+geom_line(aes(group = BiologicalReplicate))+
theme_point+
facet_wrap(~Tube, ncol = 5, scales="free_x") +
theme(panel.grid.major.x=element_line(linetype = "dashed",color="gray88"), axis.text.x=element_text(size=8)) +
scale_y_continuous(limits = c(0, NA))+
scale_color_manual(values=color_panel) +
theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1)) + labs(title="hemolysis in PFP - split by tube", y = "abs at 414 nm", col = "", x = NULL)
ggplotly(p1)ggsave("./plots/hemolysis_PFP_lineplot_byTube.pdf", plot = p1, dpi = 300)
hemolysis_lineplot <- ggplot2::last_plot()
p1 <- ggplot(sample_annotation,aes(x=TimeLapse,y=Hemolysis_PFP, col=Tube, group = BiologicalReplicate))+
geom_point()+geom_line()+
theme_point+
facet_wrap(~BiologicalReplicate, ncol = 5, scales="free_x") +
theme(panel.grid.major.x=element_line(linetype = "dashed",color="gray88"), axis.text.x=element_text(size=8)) +
scale_y_continuous(limits = c(0, NA))+
scale_color_manual(values=color_panel) +
theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1)) + labs(title="hemolysis in PFP - split by donor", y = "abs at 414 nm", col = "", x = NULL)
ggplotly(p1)tmp_summary <- sample_annotation %>% group_by(GraphTube) %>% dplyr::summarize(mean_Hemolysis_PFP= mean(Hemolysis_PFP), sd_Hemolysis_PFP=sd(Hemolysis_PFP)) %>% mutate(CV_Hemolysis_PFP = sd_Hemolysis_PFP/mean_Hemolysis_PFP*100) %>% left_join(unique(dplyr::select(sample_annotation, c("Tube","GraphTube"))), by="GraphTube")
FC <- generateFC(input_df = sample_annotation %>% filter(!is.na(Hemolysis_PFP)), variable = "Hemolysis_PFP", dfName = "hemolysis in PFP")An overview of the samples included in this experiment.
First, we performed “preruns” with a Nextseq 500 mid-output kit, to determine whether the samples could be equimolarly pooled. This seemed not be the case, the spread of the % reads identified was too large (1000 fold difference). Thus, we decided to pool based on the concentration of the RC and LP spikes, measured with qPCR. For additional information, see manuscript and supplemental files.
For platelet and hemolysis measurements, see exRNAQC005 (blood tube study RNA exome, mRNA).
Indexing QC. After pooling based on spike concentration, there is a very uneven distribution of sequencing reads, with up to 1000 fold difference between the highest and the lowest sample.
p1 <- ggplot(sample_annotation,aes(x=SampleID,y=index_prct, color = Tube))+
geom_point()+
ylim(0,NA) + theme_bar+ theme(axis.text.x=element_blank()) +
labs(y = "% reads per sample", title="basespace indexing %", col = NULL, fill = NULL)+
scale_color_manual(values=color_panel) + facet_wrap(~ RunID, scales = "free")
ggplotly(p1)ggsave("./plots/BasespaceIndexingPrct.pdf", plot = p1, dpi = 300)
p1 <- ggplot(sample_annotation,aes(x=TimeLapse,y=index_prct, col=BiologicalReplicate, group = BiologicalReplicate))+
geom_point()+geom_line()+
theme_point+
facet_wrap(~Tube, ncol = 5, scales="free_x") +
theme(panel.grid.major.x=element_line(linetype = "dashed",color="gray88"), axis.text.x=element_text(size=8)) +
scale_y_continuous(limits = c(0, NA))+
scale_color_manual(values=color_panel) +
theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1))+
labs(y = "% reads per sample", title="basespace indexing %", col = "", x = NULL)+
scale_y_log10()
ggplotly(p1)files<-list.files(log_path,recursive=TRUE)
files<-files[grep("*line_count.txt", files)]
reads_df <- data.frame(total_reads = as.numeric(), total_reads_after_qc = as.numeric(), UniqueID = as.character())
for(i in 1:length(files)){
name_sample <- unlist(strsplit(unlist(strsplit(files[i], "/"))[1], "-"))[1]
#name_sample<-gsub(".*/","",sub("-[0-9]*$","",sub("_spikes.txt","",files[i])))
tmp<-read.table(paste0(log_path,files[i]), header=T, sep="\t", comment.char = "", skip = 0)
#tmp<-read.table(paste0(log_path,files[i]), header=F, sep=":", comment.char = "", skip = 4, nrows = 3)
total_reads_after_qc <- (tmp$qc_lines)/4
total_reads_start <- (tmp$orig_lines)/4
tmp <- data.frame(UniqueID = name_sample, total_reads = total_reads_start, total_reads_after_qc = total_reads_after_qc)
reads_df <- bind_rows(reads_df, tmp)
}
files<-list.files(log_path,recursive=TRUE)
files<-files[grep("*read_count_new", files)]
mapped_df <- data.frame(aligned_reads = as.numeric(), UniqueID = as.character())
for(i in 1:length(files)){
name_sample <- unlist(strsplit(unlist(strsplit(files[i], "/"))[1], "-"))[1]
tmp<-read.table(paste0(log_path,files[i]), header=T, sep="\t", comment.char = "", skip = 0)
mapped_reads <- tmp[1,2]
tmp <- data.frame(UniqueID = name_sample, aligned_reads = mapped_reads)
mapped_df <- bind_rows(mapped_df, tmp)
}
mapped_df <- merge(reads_df, mapped_df, by = "UniqueID")
mappedSpikes <- colSums(spikes[,-1], na.rm = TRUE) %>% melt()
mappedSpikes$UniqueID <- rownames(mappedSpikes)
colnames(mappedSpikes) <-c("spikeCounts", "UniqueID")
mapped_df <- merge(mapped_df, mappedSpikes)
mapped_df$aligned_reads_plus_spikes <- mapped_df$aligned_reads+ mapped_df$spikeCounts
mapped_df$prct_aligned <- (mapped_df$aligned_reads/800000)*100
mapped_df$prct_aligned_plus_spikes <- (mapped_df$aligned_reads_plus_spikes/800000)*100
mapped_df_annot <- merge(sample_annotation, mapped_df, by = "UniqueID")
mapped_df_melt <-mapped_df %>% melt(value.name = "reads")
mapped_df_melt_annot <- full_join(mapped_df_melt, sample_annotation, by = "UniqueID")
mapped_df_save <- merge(sample_annotation %>% select(UniqueID, SampleID), mapped_df, by = "UniqueID")
write_tsv(mapped_df_save, "./smallRNA_table.tsv")MultiQC reports of the pipeline can be found in this GitHub repository under logs/20200117.logs/*mulitqc.html
The script of the pipeline to see the parameters can be found in this GitHub repository under logs/20200117.logs/smallRNASeq_preprocessing.py and logs/20200117.logs/submitJob.sh.
All fastq files (after adaptor trimming and other QC) were subsampled to 800 000 reads (based on the sample with the lowest amount of reads across all samples). This results in a substantial reduction of reads across all samples. Reads were subsampled after adaptor trimming and other QC.
pd <- position_dodge(0.2)
p1 <- ggplot(mapped_df_melt_annot %>% filter(variable != "prct_aligned"),aes(x=reorder(variable, desc(variable)),y=reads, group = PlasmaID, color = Tube))+
geom_line(position = pd)+
geom_point(position = pd) +
facet_wrap(~RunID, ncol = 3)+
labs(title="Reads per sequencing run", y = "# reads", x = "stage in pipeline", col = NULL) +
theme_bar+
#facet_wrap(~TimeLapse+TubeType, scales = "free", ncol = 3)+
theme(axis.text.x=element_text(angle=90, hjust=1)) +
scale_x_discrete(labels = c("total_reads", "total_reads_after_qc", "aligned_reads_after_subSamp"), limits=c("total_reads", "total_reads_after_qc", "aligned_reads"))+
scale_color_manual(values=color_panel) + scale_y_continuous(trans = "log10", lim = c(1e4,1e8))
ggplotly(p1)Further visualisation of the percentage of mapped reads shows varying mapping percentage. Overall, mapping % is lower in non-preservation tubes. In some conditions, we can also see a time-impact of the mapping percentage (e.g. RNA Streck).
We investigated the impact of the mapping percentage in several ways:
Overall, the consensus was to further proceed with the data-analysis and not to subsample based on mapped percentage.
p1 <- ggplot(mapped_df_annot, aes(x=GraphTube,y=prct_aligned,col=Tube, text= UniqueID)) +
geom_point() +
theme_point +
facet_wrap(~TubeType, scales = "free", ncol = 3)+
theme(axis.text.x = element_text(angle=90, vjust=0.5, hjust=1)) +
scale_y_continuous(limits=c(0,100),labels=full_nr) +
scale_color_manual(values=color_panel2) +
labs(x="",y="mapped reads (% total)", title = "percentage mapped reads", col = "tube")
ggplotly(p1, tooltip = "text")ggsave("./plots/mappingPrct.pdf", plot = p1, dpi = 300)
p1 <- ggplot(mapped_df_annot,aes(x=TimeLapse,y=prct_aligned, col=BiologicalReplicate, group = BiologicalReplicate))+
geom_point()+geom_line()+
theme_point+
facet_wrap(~Tube, ncol = 5, scales="free_x") +
theme(panel.grid.major.x=element_line(linetype = "dashed",color="gray88"), axis.text.x=element_text(size=8)) +
scale_y_continuous(limits = c(0, NA))+
scale_color_manual(values=color_panel) +
theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1))+
labs(x="",y="mapped reads (% total)", title = "percentage mapped reads",col = "", x = "time point")
ggplotly(p1)ggsave("./plots/mappingPrct_lineplot.pdf", plot = p1, dpi = 300)
p1 <- ggplot(mapped_df_annot, aes(x = total_reads, y = prct_aligned)) + geom_point(aes(color = Tube)) + theme_point + labs(y = "% mapped", x= "total reads", title = "mapping percentage vs total raw reads", col = NULL) + scale_color_manual(values=color_panel)
ggplotly(p1)ggsave("./plots/mappingPrct_vs_rawreads.pdf", plot = p1, dpi = 300)
# lowest number of reads ( = level to subsample to)
print("Level to subsample to:")## [1] "Level to subsample to:"
## [1] 842166
spikes_sum <- spikes %>% gather(key="UniqueID",value="spikecounts",-"spikeID") %>%
group_by(UniqueID) %>% dplyr::summarise(counts=sum(spikecounts, na.rm=T)) %>%
#spread(key=UniqueID,value=spikesum) %>%
mutate(reads="spikes")
# add sum of spikes to the total mapped reads (these were not considered as "mapped" yet)
reads_complete <- reads %>% gather(key="UniqueID",value="counts",-"reads") %>%
rbind(., spikes_sum) %>%
spread(key=reads, value=counts) %>%
mutate(all_mapped = mapped + spikes) %>%
gather(key="reads",value="counts",-"UniqueID")We add two types of spikes:
Around 0-5% of all reads are going to spike counts. Like in RNA Exome study (exRNAQC005), the number of reads going to spikes is a marker for RNA content in either eluate or plasma (assuming isolation or prep efficiency is similar in all samples, likely to be the case as the same methods were used).
spikesum<-gather(spikes, key="UniqueID",value="counts",-"spikeID") %>%
mutate(spiketype=sub("-.*","",spikeID)) %>% group_by(UniqueID,spiketype) %>%
dplyr::summarise(spikesum=sum(counts,na.rm = TRUE))
perc_spikes<-full_join(reads_complete %>% filter(reads=="all_mapped"), spikesum,by="UniqueID") %>%
dplyr::select(-reads)
perc_spikes<-perc_spikes %>% dplyr::mutate(perc=spikesum/counts*100)
#write_tsv(perc_spikes,"percentage_spikes.txt")
perc_spikes<-left_join(perc_spikes,sample_annotation, by="UniqueID")
p1 <- ggplot(perc_spikes, aes(x=GraphTube,y=perc,col=Tube, text= UniqueID)) +
geom_point() +
theme_point +
facet_wrap(~TubeType, scales = "free_x", ncol = 3)+
theme(axis.text.x = element_text(angle=90, vjust=0.5, hjust=1)) +
scale_y_continuous(limits=c(NA,20),labels=full_nr) +
scale_color_manual(values=color_panel2) +
labs(x="",y="% of all going to spikes", title = "percentage of all reads going to RC or LP spikes", col = NULL)
ggplotly(p1)## Uniformity of spikes
spikes_annotated<-left_join(melt(spikes) %>% dplyr::rename(UniqueID=variable),sample_annotation)
spikes_annotated[is.na(spikes_annotated)] <- 0
spikeconc<-read_tsv("../../Resources/spike_conc.txt")## Abundance compared to miRNAs
all_miRs_spikes<-bind_rows(miRNAs %>%dplyr::rename(spmiRID=MIMATID),spikes %>% dplyr::rename(spmiRID=spikeID))
all_miRs_spikes_melt<-gather(all_miRs_spikes, key="UniqueID",value="counts",-"spmiRID") %>%
mutate(type=sub("-.*","",sub("[0-9]*$","",spmiRID))) %>% dplyr::group_by(UniqueID) %>%
mutate(ordermir=row_number(counts)) %>% # dplyr::row_number ranks the values (ascending order) with ties.method = "first"
left_join(sample_annotation, by="UniqueID")The volume of RC spikes added is proportional to plasma input volume (1 µL RC / 100 µL plasma)
#sum all counts for endogenous RNA (miRNA + contam), LP and RC respectively
####??? should we use mapped here instead of miRNA + contam?
# gene_level_ratios <- rbind(miRNAs %>% dplyr::rename(ensembl_id=MIMATID),
# contam %>% dplyr::select(-c("contam"))) %>%
# mutate(type="endogenous") %>% group_by(type) %>%
# dplyr::summarise_if(is.numeric, sum, na.rm=TRUE) %>%
# rbind(., spikes %>% mutate(type=gsub(".-..$","",spikeID)) %>%
# group_by(type) %>% dplyr::summarise_if(is.numeric, sum, na.rm=TRUE)) %>%
gene_level_ratios <- rbind(reads %>% filter(reads=="mapped_miR") %>%
mutate(type="endogenous") %>% group_by(type) %>%
dplyr::summarise_if(is.numeric, sum, na.rm=TRUE),
spikes %>% mutate(type=gsub(".-..$","",spikeID)) %>%
group_by(type) %>%
dplyr::summarise_if(is.numeric, sum, na.rm=TRUE)) %>%
gather(., key="UniqueID",value="counts",-type) %>% #long format
spread(., key = "type", value="counts") %>%
mutate("LPvsEndo"=LP/endogenous,
"RCvsEndo"=RC/endogenous,
"EndovsRC"=endogenous/RC,
"EndovsLP"= endogenous/LP) %>%
left_join(., sample_annotation %>% dplyr::select(c("UniqueID","Tube","SampleID","GraphTube","EluateV","PlasmaInputV", "BiologicalReplicate", "TimeLapse")), by="UniqueID") #add annotation
spikes1 <- ggplot(gene_level_ratios, aes(x=GraphTube, y=EndovsRC, fill=Tube, colour=Tube)) +
#geom_bar(stat="identity") +
geom_point(size=1.2) +
labs(x="", y="Endogenous/RC") +
scale_fill_manual(values=color_panel) +
scale_colour_manual(values=color_panel) +
theme_bar +
labs(x="", y="endogenous/RC", title=NULL, col = NULL, fill = NULL)
#calculate statistics for Sequin/endogenous (sd, sem, 95% ci)
test <- log_rescaling_CI(gene_level_ratios, measurevar="EndovsRC", groupvar=c("GraphTube"))
# Plot LP/endo in log10 scale
spikes2 <- ggplot(test, aes(x=GraphTube, y=10^measurevar_log_resc)) +
#geom_bar(position=position_dodge(), stat="identity")+
geom_errorbar(aes(ymin=ci_lower_oriscale, ymax=ci_upper_oriscale), colour="grey", width=.1) +
geom_point(aes(colour=Tube), size=1.2) +
geom_point(data=test, aes(x=GraphTube, y=mean_oriscale), shape=4, colour="grey") +
geom_hline(yintercept = 1, linetype="dashed",colour="grey88") +
labs(x="", y="relative small RNA conc. (rescaled)", title=NULL, col = NULL, fill = NULL) +
scale_colour_manual(values=color_panel) +
scale_y_log10() +
theme_bar
ggplotly(spikes1)ggsave("./plots/endoVsRC_CI.pdf", plot = ggplot2::last_plot(), dpi = 300)
p1 <- ggplot(test,aes(x=TimeLapse,y=10^measurevar_log_resc, col=BiologicalReplicate, group = BiologicalReplicate))+
geom_point()+geom_line()+
theme_point+
facet_wrap(~Tube, ncol = 5, scales="free_x") +
theme(panel.grid.major.x=element_line(linetype = "dashed",color="gray88"), axis.text.x=element_text(size=8)) +
scale_y_continuous(limits = c(0, NA))+
scale_color_manual(values=color_panel) +
theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1)) +
scale_y_log10() +
labs(x="", y="relative endogenous/RC", title="Relative small RNA conc. in plasma based on RC counts (rescaled to lowest)", col = NULL, x = NULL)
ggplotly(p1)ggsave("./plots/endoVsRC_lineplot.pdf", plot = ggplot2::last_plot(), dpi = 300)
RNAconc_lineplot <- ggplot2::last_plot()In this metric, we can observe that the RNA concentration in the plasma is much more variable in the preservation tubes (except for the Roche tube), while the fold changes in the non-preservation tubes are very close to the expected value of 1.
The ratio of endogenous RNA to LP reflects the concentration of endogenous RNA in the eluate. More endogenous RNA in eluate after RNA isolation results in proporionally fewer reads going to LP spikes, and thus a higher ratio of endogenous/LP.
# miRNAs <- kallisto %>% filter(!str_detect(ensembl_transcript_id,"LP|R1|R2")) %>% group_by(MIMATID) %>% summarise_if(is.numeric, sum, na.rm=TRUE)
spikes1 <- ggplot(gene_level_ratios, aes(x=GraphTube, y=LPvsEndo, colour=Tube)) +
#geom_bar(stat="identity") +
geom_point(size=1.2) +
scale_fill_manual(values=color_panel) +
scale_colour_manual(values=color_panel) +
theme_bar +
labs(x="", y="LP/endogenous", title="inverse of RNA concentration", subtitle="LP vs Endogenous RNA", colour = NULL, fill = NULL)
spikes2 <- ggplot(gene_level_ratios, aes(x=GraphTube, y=EndovsLP, colour=Tube)) +
#geom_bar(stat="identity") +
geom_point(size=1.2) +
scale_fill_manual(values=color_panel) +
scale_colour_manual(values=color_panel) +
theme_bar +
labs(x="", y="endogenous/LP", title=NULL, subtitle=NULL, colour = NULL, fill = NULL)
ggplotly(spikes2)ggsave("./plots/endoVsLP.pdf", plot = ggplot2::last_plot(), dpi = 300)
#grid.arrange(spikes1,spikes2)test <- log_rescaling_CI(gene_level_ratios, measurevar="EndovsLP", groupvar=c("GraphTube"))
# Plot LP/endo in log10 scale
spikes_conc <- ggplot(test, aes(x=GraphTube, y=10^measurevar_log_resc)) +
#geom_bar(position=position_dodge(), stat="identity")+
geom_errorbar(aes(ymin=ci_lower_oriscale, ymax=ci_upper_oriscale), colour="grey", width=.1) +
geom_point(aes(colour=Tube), size=1.2) +
geom_point(data=test, aes(x=GraphTube, y=mean_oriscale), shape=4, colour="grey") +
geom_hline(yintercept = 1, linetype="dashed",colour="grey88") +
labs(x="", y="relative endogenous small RNA concentration", title=NULL, subtitle="endogenous small RNA vs LP (rescaled)", col = NULL) +
scale_colour_manual(values=color_panel) +
scale_y_log10() +
theme_bar
ggplotly(spikes_conc)ggsave("./plots/endoVsLP_CI.pdf", plot = ggplot2::last_plot(), dpi = 300)
tmp_summary <- gene_level_ratios %>% group_by(GraphTube) %>% dplyr::summarize(mean_EndovsLP= mean(EndovsLP), sd_EndovsLP=sd(EndovsLP)) %>% mutate(CV_EndovsLP = sd_EndovsLP/mean_EndovsLP*100) %>% left_join(unique(dplyr::select(sample_annotation, c("Tube","GraphTube"))), by="GraphTube")
p1 <- ggplot(test,aes(x=GraphTube,y=10^measurevar_log_resc, col=BiologicalReplicate, group = BiologicalReplicate))+
geom_point()+geom_line()+
theme_point+
facet_wrap(~Tube, ncol = 5, scales="free_x") +
theme(panel.grid.major.x=element_line(linetype = "dashed",color="gray88"), axis.text.x=element_text(size=8)) +
scale_y_continuous(limits = c(0, NA))+
scale_color_manual(values=color_panel) +
theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1)) +
scale_y_log10() +
labs(x="", y="relative endogenous small RNA concentration", title="Relative small RNA conc. based on LP spikes", subtitle="endogenous RNA vs LP (rescaled to lowest)", col = "", x = NULL)
ggplotly(p1)As we add an identical amount of RC and LP spikes to the samples, we would expect that the RC counts and LP counts are well correlated, if there are no issues in RNA extraction or library preparation. We can indeed see an overall good spearman correlation of 0.85. However, when inspecting the individual line plots, we can i.e. see that DNA Streck has a very low RC/LP ratio, indicating problems with RNA extraction in this tube.
collapse_unique <- function(x) {
paste(unique(x), collapse = ",")
}
perc_spikes<- spread(perc_spikes, spiketype, spikesum)
perc_spikes<- dplyr::select(perc_spikes, -c("perc"))
perc_spikes <- aggregate(perc_spikes[,-1], list(perc_spikes[,1]), FUN = collapse_unique)
perc_spikes$LP1 <- as.numeric(gsub('\\D+','', perc_spikes$LP1))
perc_spikes$RC1 <- as.numeric(gsub('\\D+','', perc_spikes$RC1))
perc_spikes$RC2 <- as.numeric(gsub('\\D+','', perc_spikes$RC2))
perc_spikes$RC <- perc_spikes$RC1 + perc_spikes$RC2
colnames(perc_spikes)[1] <- "UniqueID"
ggplot(gene_level_ratios, aes(x = EndovsLP, y = EndovsRC, col = Tube)) + geom_point() + theme_point + scale_y_continuous(trans = "log2") + scale_x_continuous(trans = "log2") + labs(title = "Correlation of ") + theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1)) + labs(x = "endo/LP", y = "endo/RC", title = "correlation of RNA concentration in plasma (RC) and eluate (LP)", col = NULL, fill = NULL) +
geom_smooth(method = "lm", se = TRUE, aes(col = FALSE))+
theme_point + geom_abline(intercept=0, slope=1, color = "gray", linetype = "dashed")+
theme(panel.grid.major.x=element_line(linetype = "dashed",color="gray88"), axis.text.x=element_text(size=8)) +
scale_color_manual(values=color_panel) +
stat_cor(method = "spearman", aes(col = FALSE, label=paste(..r.label.., cut(..p..,breaks = c(-Inf, 0.0001, 0.001, 0.01, 0.05, Inf), labels = c("'****'", "'***'", "'**'", "'*'", "'ns'")), sep = "~")))ggsave("./plots/RCvsLP_corrplot.pdf", plot = ggplot2::last_plot(), dpi = 300)
p1 <- ggplot(perc_spikes,aes(x=GraphTube,y=RC/LP1, col=BiologicalReplicate, group = BiologicalReplicate))+
geom_point()+geom_line()+
theme_point+
facet_wrap(~Tube, ncol = 5, scales="free_x") +
theme(panel.grid.major.x=element_line(linetype = "dashed",color="gray88"), axis.text.x=element_text(size=8)) +
scale_y_continuous(limits = c(0, NA))+
scale_color_manual(values=color_panel) +
theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1))+
labs(x="", y = "RC/LP", title="RC/LP",subtitle="", col = "")
ggplotly(p1)The cut-off from exRNAQC011 (kit study), based on technical replicates, was used (here 3 counts). We showed that the cut-off that we proposed stays stable, even if reads are subsampled to 800k.
#cutoff_kit <- data.frame(MIR0.2 = 6, CCF1 = 6, CCF4=6, CIRC0.25 = 13.01, CIRC5 = 5.96, MAX0.1 = 10.87, MIRA0.2 = 9, MIRA0.6 = 6.73, MIRV0.1 = 14.01, MIRV0.625 = 9.01, NUC0.3 = 10.1, NUC0.9 = 7.62)
#cutoff_kit <- gather(cutoff_kit, key = GraphTube, value = median_th)
cutoff_kit <- data.frame(Tube = sample_annotation$Tube, GraphTube = sample_annotation$GraphTube, median_th = 3)#ensembl <- useEnsembl(biomart="ensembl",dataset="hsapiens_gene_ensembl", version = 91)
#genes_ens <- getBM(attributes=c('MIMATID','gene_biotype'),mart=ensembl)
#genes_ens <- data.table::fread(paste0(data_path,"gene_biotypes_ensemblv91.txt"), header=T, data.table = F)
miRNAs_long <- miRNAs %>% gather(., key="UniqueID", value="est_counts", -MIMATID) %>% #long format
left_join(., dplyr::select(sample_annotation,c(UniqueID,GraphTube,SampleID, Tube, BiologicalReplicate)), by="UniqueID")
#keep only protein coding genes with more than 0 counts
miRNAs_long <- miRNAs_long %>% filter(est_counts > 0)
number_miR_detected <- miRNAs_long %>% group_by(SampleID) %>% dplyr::summarize(miR_above0=n())
number_miR_detected <- full_join(number_miR_detected,
miRNAs_long %>% group_by(SampleID) %>%
dplyr::summarize(total_est_counts_above0=sum(est_counts)),
by="SampleID")
#cutoff_kit <- single_pos %>% group_by(GraphTube) %>% dplyr::summarise(median_th = median(filter_threshold))
miRNAs_cutoff <- miRNAs %>% gather(., key="UniqueID", value="est_counts", -MIMATID) %>% #long format
left_join(., dplyr::select(sample_annotation,c(UniqueID,GraphTube,SampleID, Tube, BiologicalReplicate, TimeLapse)), by="UniqueID")
#left_join(., cutoff_kit, by="GraphTube") #add the median cut-off for each kit
miRNAs_cutoff <- miRNAs_cutoff %>%
filter(est_counts > 6)
number_miR_detected <- full_join(number_miR_detected,
miRNAs_cutoff %>% group_by(SampleID) %>%
dplyr::summarize(miR_aboveTh=n()),
by="SampleID")
number_miR_detected <- full_join(number_miR_detected,
miRNAs_cutoff %>% group_by(SampleID) %>%
dplyr::summarize(total_est_counts_aboveTh=sum(est_counts)),
by="SampleID")
number_miR_detected <- left_join(number_miR_detected,
dplyr::select(sample_annotation, c(UniqueID,GraphTube, RNAisolation, EluateV,SampleID, Tube, BiologicalReplicate, TimeLapse)),
by="SampleID")
#convert to the original format
miRNAs_cutoff <- dplyr::select(miRNAs_cutoff, MIMATID, UniqueID, est_counts, Tube, BiologicalReplicate, TimeLapse) %>%
spread(., key=UniqueID, value=est_counts)
#write.csv( miRNAs_cutoff, file="../../../exRNAQC/ miRNAs_cutoff.csv",row.names=FALSE, na="")After filtering, there is a substantial drop in number of miRNAs. Note that we also subsampled to 800k reads, so the number of miRNAs detected is severely skewed due to this subsampling. After filtering, the number of miRNAs is relatively stable across all tubes, despite the mapping issue described above.
p1 <- ggplot(number_miR_detected,aes(x=TimeLapse,y=miR_aboveTh, col=BiologicalReplicate, group = BiologicalReplicate))+
geom_point()+geom_line()+
theme_point+
facet_wrap(~Tube, ncol = 5, scales="free_x") +
theme(panel.grid.major.x=element_line(linetype = "dashed",color="gray88"), axis.text.x=element_text(size=8)) +
scale_y_continuous(limits = c(0, NA))+
scale_color_manual(values=color_panel) +
theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1))+
labs(x="", title="absolute number of miRNAs (filtered)",subtitle="", col = "", y = "number of miRNAs above threshold", x = NULL)
ggplotly(p1)In this metric, we observe that the number of miRNAs remains relatively stable in most tubes, except for DNA Streck (fold changes are close to 1).
names(FC)[names(FC) == 'foldchange'] <- 'miRNAcount_FC'
FC_all <- merge(FC %>% select(-inputType), FC_all, on = c("Tube", "BiologicalReplicate", "ReplID", "Replicates"))
ggsave("./plots/number_of_miRNA_unfilt_FC.pdf", plot = ggplot2::last_plot(), dpi = 300)
miRNACount_FC_plot <- ggplot2::last_plot()Contaminants in miRNA data are reads going to other biotypes than miRNA, such as:
These are often unwanted and take up expensive sequencing reads. The more contaminants, the deeper you have to sequence to obtain sufficient reads for miRNAs.
For every sample, we plot the percentage of counts and genes for each biotype. We are usually interested in miRNAs, and the other biotypes are often seen as contaminants (see above).
color_panel3 <- color_panel
color_panel3[length(color_panel3)] <- "darkcyan"
color_panel3[length(color_panel3)+1] <- "darkgray"
#miR
tmp1 <- miRNAs %>% gather(key="UniqueID",value="counts", -"MIMATID") %>%
mutate(type="miRNA") %>%
dplyr::group_by(UniqueID,type) %>%
dplyr::summarise(sum_counts = sum(counts, na.rm=T)) #calculate sum of MIMAT per sample
#contaminants
tmp2 <- gather(contam, key="UniqueID",value="counts",-c("contam","ensembl_id")) %>%
dplyr::select(c("UniqueID","type"="contam","counts")) %>%
mutate(type = gsub("multiple_misc_RNA_snRNA","multiple_snRNA_misc_RNA",type)) %>%
dplyr::group_by(UniqueID, type) %>%
dplyr::summarise(sum_counts=sum(counts,na.rm=T))
#join them together and calculate %
tmp <- rbind(tmp1,tmp2)
#notann
tmp3 <- tmp %>% ungroup() %>% dplyr::group_by(UniqueID) %>%
dplyr::summarise(sum_all = sum(sum_counts, na.rm=T)) %>%
full_join(gather(reads %>% filter(reads=="mapped"), key="UniqueID", value="mapped", -"reads") %>% dplyr::select(-reads), by="UniqueID") %>%
mutate(type="not_annotated",not_ann = mapped-sum_all) %>%
dplyr::select(c("UniqueID","type","sum_counts"="not_ann")) %>% group_by(UniqueID)
perc_all <- rbind(tmp,tmp3) %>%
full_join(reads_complete %>% filter(reads=="mapped") %>%
dplyr::select(-reads), by="UniqueID") %>%
ungroup() %>% dplyr::group_by(UniqueID,type) %>%
mutate(perc=sum_counts/counts*100) %>%
left_join(sample_annotation, by="UniqueID")
perc_all$type[grep("multiple",perc_all$type)]<-"multiple"
p1 <- ggplot(perc_all %>% filter(TubeType == "preservation"),aes(x=SampleID,y=perc,fill=type))+
geom_bar(stat="identity",position = "fill")+
theme_bar+
facet_wrap(~Tube, nrow=1, scales="free_x") +
theme(axis.text.x = element_text(angle=90, vjust=0.5, hjust=1),axis.line.x = element_blank(),axis.ticks.x = element_blank())+
scale_y_continuous(expand=c(0,0))+
scale_fill_manual(values=c("black", color_panel3)) +
labs(title="all biotypes (preservation tubes)", y = "fraction", x = NULL, fill = NULL)
ggplotly(p1)ggsave("./plots/biotypes_preservation_nospikes.pdf", plot = ggplot2::last_plot(), dpi = 300)
p1 <- ggplot(perc_all %>% filter(TubeType == "non-preservation"),aes(x=SampleID,y=perc,fill=type))+
geom_bar(stat="identity",position = "fill")+
theme_bar+
facet_wrap(~Tube, nrow=1, scales="free_x") +
theme(axis.text.x = element_text(angle=90, vjust=0.5, hjust=1),axis.line.x = element_blank(),axis.ticks.x = element_blank())+
scale_y_continuous(expand=c(0,0))+
scale_fill_manual(values=c("black", color_panel3)) +
labs(title="all biotypes (non-preservation tubes)", y = "fraction", x = NULL, fill = NULL)
ggplotly(p1)The previous bar charts are biotypes without spike reads. The next figures shows that spike reads only make up a small percentage of total mapped reads.
tmp1 <- miRNAs %>% gather(key="UniqueID",value="counts", -"MIMATID") %>%
mutate(type="miRNA") %>%
dplyr::group_by(UniqueID,type) %>%
dplyr::summarise(sum_counts = sum(counts, na.rm=T)) #calculate sum of MIMAT per sample
#contaminants
tmp2 <- gather(contam, key="UniqueID",value="counts",-c("contam","ensembl_id")) %>%
dplyr::select(c("UniqueID","type"="contam","counts")) %>%
dplyr::group_by(UniqueID, type) %>%
dplyr::summarise(sum_counts=sum(counts,na.rm=T))
#join them together
tmp <- rbind(tmp1,tmp2)
#notann
tmp3 <- tmp %>% ungroup() %>% dplyr::group_by(UniqueID) %>%
dplyr::summarise(sum_all = sum(sum_counts, na.rm=T)) %>%
full_join(gather(reads %>% filter(reads=="mapped"), key="UniqueID", value="mapped", -"reads") %>% dplyr::select(-reads), by="UniqueID") %>%
mutate(type="not_annotated",not_ann = mapped-sum_all) %>%
dplyr::select(c("UniqueID","type","sum_counts"="not_ann")) %>% group_by(UniqueID)
tmp4 <- spikes %>% gather(key="UniqueID",value="counts",-"spikeID") %>%
ungroup() %>% dplyr::group_by(UniqueID) %>%
dplyr::summarise(sum_counts = sum(counts, na.rm=T)) %>%
mutate(type="spikes") %>%
group_by(UniqueID)
tmpb <- rbind(tmp3,tmp4)
perc_all <- rbind(tmp,tmpb) %>%
full_join(reads_complete %>% filter(reads=="all_mapped") %>%
dplyr::select(-reads), by="UniqueID") %>%
ungroup() %>% dplyr::group_by(UniqueID,type) %>%
mutate(perc=sum_counts/counts*100) %>%
left_join(sample_annotation, by="UniqueID")
perc_all$type[grep("multiple",perc_all$type)]<-"multiple"
p1 <- ggplot(perc_all %>% filter(TubeType == "preservation"),aes(x=SampleID,y=perc,fill=type))+
geom_bar(stat="identity",position = "fill")+
theme_bar+
facet_wrap(~Tube, nrow=1, scales="free_x") +
theme(axis.text.x = element_text(angle=90, vjust=0.5, hjust=1),axis.line.x = element_blank(),axis.ticks.x = element_blank())+
scale_y_continuous(expand=c(0,0))+
scale_fill_manual(values=c("black", color_panel3)) +
labs(title="all biotypes (preservation tubes)", y = "fraction", x = NULL, fill = NULL)
ggplotly(p1)ggsave("./plots/biotypes_preservation_plusSpikes.pdf", plot = ggplot2::last_plot(), dpi = 300)
p1 <- ggplot(perc_all %>% filter(TubeType == "non-preservation"),aes(x=SampleID,y=perc,fill=type))+
geom_bar(stat="identity",position = "fill")+
theme_bar+
facet_wrap(~Tube, nrow=1, scales="free_x") +
theme(axis.text.x = element_text(angle=90, vjust=0.5, hjust=1),axis.line.x = element_blank(),axis.ticks.x = element_blank())+
scale_y_continuous(expand=c(0,0))+
scale_fill_manual(values=c("black", color_panel3)) +
labs(title="all biotypes (non-preservation tubes)", y = "fraction", x = NULL, fill = NULL)
ggplotly(p1)perc_miRs <- miRNAs %>% gather(key="UniqueID",value="counts", -"MIMATID") %>% mutate(type=gsub("[0-9].*$","",MIMATID)) %>%
dplyr::group_by(UniqueID,type) %>%
dplyr::summarise(sum_counts = sum(counts, na.rm=T)) %>% #calculate sum of MIMAT per sample
full_join(reads_complete %>% filter(reads=="all_mapped")) %>%
mutate(perc=sum_counts/counts*100) %>% #calculate perc of miRs and spikes of total mapped reads
left_join(sample_annotation, by="UniqueID")
p1 <- ggplot(perc_all %>% filter(type == "miRNA"),aes(x=TimeLapse,y=perc, col=BiologicalReplicate, group = BiologicalReplicate))+
geom_point()+geom_line()+
theme_point+
facet_wrap(~Tube, ncol = 5, scales="free_x") +
theme(panel.grid.major.x=element_line(linetype = "dashed",color="gray88"), axis.text.x=element_text(size=8)) +
scale_y_continuous(limits = c(0, NA))+
scale_color_manual(values=color_panel) +
theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1))+
labs( y="miRNA counts (%)", title="percentage of miRNA counts", x = NULL, col = NULL)
ggplotly(p1)ggsave("./plots/prct_miRNAs_nospikes.pdf", plot = ggplot2::last_plot(), dpi = 300)
biotype_lineplot <- ggplot2::last_plot()If no changes occur in the tube over time, the fraction of the reads going to miRNAs should remain relatively stable. However, we can see in this metric that the preservation tubes perform worse than the non-preservation tubes (except for Biomatrica).
The ALC values are summarized based on the average of the ALC values (after removing NAs). The dot plot at the end gives an overview of all these values.
Score: lower ALC = better
#print(all_plot + labs(title="Pairwise ALCs"))
ALC_melt <- left_join(ALC, sample_annotation[,c("Tube")], by="Tube")
ALC$BiologicalReplicate <- gsub("[-]TUBE[0-9]*","", ALC$BiologicalReplicate)
ALC$Replicates <- gsub("^Rep04_0$","T0-T04", ALC$Replicates)
ALC$Replicates <- gsub("^Rep0_04$","T0-T04", ALC$Replicates)
ALC$Replicates <- gsub("^Rep16_0$","T0-T16", ALC$Replicates)
ALC$Replicates <- gsub("^Rep0_16$","T0-T16", ALC$Replicates)
ALC$Replicates <- gsub("^Rep16_04$","T04-T16", ALC$Replicates)
ALC$Replicates <- gsub("^Rep04_16$","T04-T16", ALC$Replicates)
ALC$Replicates <- gsub("^Rep24_0$","T0-T24", ALC$Replicates)
ALC$Replicates <- gsub("^Rep0_24$","T0-T24", ALC$Replicates)
ALC$Replicates <- gsub("^Rep72_0$","T0-T72", ALC$Replicates)
ALC$Replicates <- gsub("^Rep0_72$","T0-T72", ALC$Replicates)
ALC$Replicates <- gsub("^Rep72_24$","T24-T72", ALC$Replicates)
ALC$Replicates <- gsub("^Rep24_72$","T24-T72", ALC$Replicates)
#ALC_melt <- ALC_melt %>% filter(Replicates %in% c("Rep0_04", "Rep04_16", "Rep0_16", "Rep0_24", "Rep24_72", "Rep0_72"))
ALC$TimePoint <- ifelse(ALC$Replicates %in% c("T24_0", "T0_24", "T0_04", "T04_0"), "1st timepoint", ifelse(ALC$Replicates%in% c("T72_0", "T0_72", "T0_16", "T16_0"), "2nd timepoint", NA))
ALC$TubeType <- ifelse(ALC$Tube %in% c("DNA Streck", "Biomatrica", "RNA Streck", "PAXgene", "Roche"), "preservation", "non-preservation")
ggplot(ALC %>% filter(!is.na(TimePoint)), aes(x=reorder(Tube,ALC_calc, FUN = function(x) -mean(x, na.rm = TRUE)), y = 2^ALC_calc)) +
geom_boxplot() +
geom_beeswarm(groupOnX=TRUE, size = 2.5, aes(col = TimePoint, shape = TubeType)) +
geom_hline(yintercept = 1, linetype = "dashed", color = "gray") +
labs(subtitle = "ordered by mean", title = paste0("pairwise ALCs"), y = "linear ALC", x = NULL, col = "comparison", shape = "tube type") +
scale_fill_manual(values=color_panel) + theme_point + theme(axis.text.x = element_text(angle=90, vjust=0.5, hjust=1))+
stat_summary(
geom = "point",
fun.y = "mean",
col = "black",
size = 2,
shape = 24,
fill = "white"
)We can conclude, based on this data-analysis and these metrics, that the preservation tubes are too variable in terms of fold-changes to be of further use. They do not claim what they do: preservation of the content based on T0. The non-preservation tubes were not tested to T72, but these are also not marketed to do that. We chose 16 hours as latest realistic timepoint in a routine lab.
For phase II of the exRNAQC study, we selected three tubes to proceed, i.e. EDTA, citrate and serum, based on their performance and availability in a routine hospital setting.
FC_all_means <- FC_all %>% dplyr::group_by(Tube) %>% dplyr::summarise_all(funs(mean), na.rm = TRUE) %>% select(-c("BiologicalReplicate", "ReplID", "Replicates", "TimePoint", "TubeType"))
ALC_mean$ALC <- 2^ALC_mean$ALC
FC_all_means <- merge(ALC_mean, FC_all_means)
FC_all_means <- FC_all_means %>% melt()
FC_all_means$TubeType <- ifelse(FC_all_means$Tube %in% c("DNA Streck", "Biomatrica", "RNA Streck", "PAXgene", "Roche"), "preservation", "non-preservation")
p <- ggplot(FC_all_means, aes(x = reorder(variable, value, FUN = function(x) -mean(x, na.rm = TRUE)), y = value, group = Tube, color = Tube))+geom_vline(xintercept = c(1,2,3,4,5), linetype = "dashed", color = "grey") + geom_line() + geom_point() + theme_point + facet_wrap(~TubeType) + labs(title = "summary of fold changes (small RNA)", y = "mean FC", x = NULL, col = NULL) + scale_color_manual(values=color_panel) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + scale_x_discrete(labels=c("ALC_FC" = "ALC", "miRNAfract_FC" = "miRNA biotype", "hemolysis_FC" = "hemolysis",
"miRNAcount_FC" = "miRNA count", "EndovsRC_FC" = "concRNA")) + scale_y_continuous(trans = "log2")
ggplotly(p)FC_all_means$TubeType <- sub("P", "p", FC_all_means$TubeType)
FC_all_means$TubeType <- sub("N", "n", FC_all_means$TubeType)
FC_all_means <- FC_all_means %>% mutate(variable = factor(variable, levels=c("miRNAcount_FC", "EndovsRC_FC", "hemolysis_FC", "miRNAfract_FC", "ALC")))
p <- ggplot(FC_all_means, aes(x = variable, y = value, group = Tube, color = Tube))+geom_vline(xintercept = c(1,2,3,4,5), linetype = "dashed", color = "grey") + geom_line() + geom_point() + theme_point + facet_wrap(~TubeType) + labs(title = "summary of fold changes (small RNA)", y = "mean FC", x = NULL, col = NULL) + scale_color_manual(values=color_panel) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + scale_x_discrete(labels=c("ALC_FC" = "ALC", "miRNAfract_FC" = "miRNA frac.", "hemolysis_FC" = "hemolysis", "miRNAcount_FC" = "number of miRNAs", "EndovsRC_FC" = "RNA concentration")) + scale_y_continuous(trans = "log2")
ggsave("./plots/FC_all_summary.pdf", plot = ggplot2::last_plot(), dpi = 300, height = 5, width = 8)
ggsave("./plots/FC_all_summary.png", plot = ggplot2::last_plot(), dpi = 300, height = 5, width = 8)
ggsave("./plots/FC_all_summary.svg", plot = ggplot2::last_plot(), dpi = 300, height = 5, width = 8)figure_L <- ggarrange(
hemolysis_lineplot + labs(title = "hemolysis in PFP", col = NULL, x = ""),
RNAconc_lineplot + labs(title = "ratio endogenous counts vs RC spike-in", col = NULL, y = "endogenous/RC"),
miRNACount_lineplot + labs(title = "absolute number of miRNAs", col = NULL, y = "number of miRNAs"),
biotype_lineplot + labs(title = "evolution of miRNA count fraction", col = NULL, y = "miRNA counts / total counts * 100"),
labels = c("A", "B", "C", "D", "E"),
common.legend = TRUE, legend = "bottom", ncol = 2, nrow = 2
)
ggsave(figure_L, filename = "./plots/line_individ_overview.png", dpi = 300, height=12, width=12)
ggsave(figure_L, filename = "./plots/line_individ_overview.pdf", dpi = 300, height=12, width=12)
ggsave(figure_L, filename = "./plots/line_individ_overview.svg", dpi = 300, height=12, width=12)
figure_R <- ggarrange(
hemolysis_FC_plot + labs(subtitle = NULL),
RNAconc_FC_plot + labs(subtitle = NULL, title = "fold change of RNA conc. based on RC spikes"),
miRNACount_FC_plot + labs(subtitle = NULL, title = "fold change of number of miRNA"),
ALC_FC_plot +
labs(subtitle = NULL, title = "area left of the curve", x = "tube") +
theme(axis.text.x = element_text(angle=90, vjust=0.5, hjust=1)),
biotype_FC_plot +
labs(subtitle = NULL, title = "fold change of miRNA count percentage"), labels = c("A", "B", "C", "D", "E"),
common.legend = TRUE, legend = "bottom", ncol = 2, nrow = 3
)
ggsave(figure_R, filename = "./plots/FC_individ_overview.png", dpi = 300, height=12, width=9)
ggsave(figure_R, filename = "./plots/FC_individ_overview.pdf", dpi = 300, height=12, width=9)
ggsave(figure_R, filename = "./plots/FC_individ_overview.svg", dpi = 300, height=12, width=9)## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] gdtools_0.2.2 plyr_1.8.6 forcats_0.5.0 stringr_1.4.0
## [5] dplyr_1.0.2 purrr_0.3.4 readr_1.4.0 tidyr_1.1.2
## [9] tibble_3.0.4 tidyverse_1.3.0 DT_0.16 ggpubr_0.4.0
## [13] ggExtra_0.9 gridExtra_2.3 ggrepel_0.8.2 plotly_4.9.2.1
## [17] RCurl_1.98-1.2 scales_1.1.1 RColorBrewer_1.1-2 broom_0.7.2
## [21] readxl_1.3.1 ggbeeswarm_0.6.0 ggplot2_3.3.2 reshape2_1.4.4
## [25] biomaRt_2.46.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 ggsignif_0.6.0 ellipsis_0.3.1
## [4] rio_0.5.16 fs_1.5.0 rstudioapi_0.11
## [7] farver_2.0.3 bit64_4.0.5 AnnotationDbi_1.52.0
## [10] fansi_0.4.1 lubridate_1.7.9 xml2_1.3.2
## [13] splines_4.0.3 knitr_1.30 jsonlite_1.7.1
## [16] Cairo_1.5-12.2 dbplyr_1.4.4 shiny_1.5.0
## [19] compiler_4.0.3 httr_1.4.2 backports_1.1.10
## [22] Matrix_1.2-18 assertthat_0.2.1 fastmap_1.0.1
## [25] lazyeval_0.2.2 cli_2.1.0 later_1.1.0.1
## [28] htmltools_0.5.0 prettyunits_1.1.1 tools_4.0.3
## [31] gtable_0.3.0 glue_1.4.2 rappdirs_0.3.1
## [34] Rcpp_1.0.5 carData_3.0-4 Biobase_2.50.0
## [37] cellranger_1.1.0 vctrs_0.3.4 svglite_1.2.3.2
## [40] nlme_3.1-149 crosstalk_1.1.0.1 xfun_0.19
## [43] openxlsx_4.2.3 rvest_0.3.6 mime_0.9
## [46] miniUI_0.1.1.1 lifecycle_0.2.0 rstatix_0.6.0
## [49] XML_3.99-0.5 hms_0.5.3 promises_1.1.1
## [52] parallel_4.0.3 yaml_2.2.1 curl_4.3
## [55] memoise_1.1.0 stringi_1.5.3 RSQLite_2.2.1
## [58] S4Vectors_0.28.0 BiocGenerics_0.36.0 zip_2.1.1
## [61] systemfonts_0.3.2 rlang_0.4.8 pkgconfig_2.0.3
## [64] bitops_1.0-6 lattice_0.20-41 evaluate_0.14
## [67] htmlwidgets_1.5.2 labeling_0.4.2 cowplot_1.1.0
## [70] bit_4.0.4 tidyselect_1.1.0 magrittr_1.5
## [73] R6_2.5.0 IRanges_2.24.0 generics_0.1.0
## [76] DBI_1.1.0 pillar_1.4.6 haven_2.3.1
## [79] foreign_0.8-80 withr_2.3.0 mgcv_1.8-33
## [82] abind_1.4-5 modelr_0.1.8 crayon_1.3.4
## [85] car_3.0-10 BiocFileCache_1.14.0 rmarkdown_2.5
## [88] progress_1.2.2 grid_4.0.3 data.table_1.13.2
## [91] blob_1.2.1 reprex_0.3.0 digest_0.6.27
## [94] xtable_1.8-4 httpuv_1.5.4 openssl_1.4.3
## [97] stats4_4.0.3 munsell_0.5.0 beeswarm_0.2.3
## [100] viridisLite_0.3.0 vipor_0.4.5 askpass_1.1